In [1]:
# importing dependencies
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn import metrics
import numpy as np
from scipy.stats import skew, kurtosis
from scipy import stats
import re
In [2]:
#Data EDA and processing
df = pd.read_csv('Car details v3.csv')
df.head()
Out[2]:
| name | year | selling_price | km_driven | fuel | seller_type | transmission | owner | mileage | engine | max_power | torque | seats | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Maruti Swift Dzire VDI | 2014 | 450000 | 145500 | Diesel | Individual | Manual | First Owner | 23.4 kmpl | 1248 CC | 74 bhp | 190Nm@ 2000rpm | 5.0 |
| 1 | Skoda Rapid 1.5 TDI Ambition | 2014 | 370000 | 120000 | Diesel | Individual | Manual | Second Owner | 21.14 kmpl | 1498 CC | 103.52 bhp | 250Nm@ 1500-2500rpm | 5.0 |
| 2 | Honda City 2017-2020 EXi | 2006 | 158000 | 140000 | Petrol | Individual | Manual | Third Owner | 17.7 kmpl | 1497 CC | 78 bhp | 12.7@ 2,700(kgm@ rpm) | 5.0 |
| 3 | Hyundai i20 Sportz Diesel | 2010 | 225000 | 127000 | Diesel | Individual | Manual | First Owner | 23.0 kmpl | 1396 CC | 90 bhp | 22.4 kgm at 1750-2750rpm | 5.0 |
| 4 | Maruti Swift VXI BSIII | 2007 | 130000 | 120000 | Petrol | Individual | Manual | First Owner | 16.1 kmpl | 1298 CC | 88.2 bhp | 11.5@ 4,500(kgm@ rpm) | 5.0 |
In [3]:
print("Shape:", df.shape)
print(f"Number of Rows: {df.shape[0]}")
print(f"Number of Columns: {df.shape[1]}")
Shape: (8128, 13) Number of Rows: 8128 Number of Columns: 13
In [4]:
display(df.info())
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8128 entries, 0 to 8127 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 name 8128 non-null object 1 year 8128 non-null int64 2 selling_price 8128 non-null int64 3 km_driven 8128 non-null int64 4 fuel 8128 non-null object 5 seller_type 8128 non-null object 6 transmission 8128 non-null object 7 owner 8128 non-null object 8 mileage 7907 non-null object 9 engine 7907 non-null object 10 max_power 7913 non-null object 11 torque 7906 non-null object 12 seats 7907 non-null float64 dtypes: float64(1), int64(3), object(9) memory usage: 825.6+ KB
None
In [5]:
df.isnull().sum()
Out[5]:
name 0 year 0 selling_price 0 km_driven 0 fuel 0 seller_type 0 transmission 0 owner 0 mileage 221 engine 221 max_power 215 torque 222 seats 221 dtype: int64
In [6]:
df = df.dropna(subset=[
"engine",
"max_power",
"mileage"
])
In [7]:
df["seats"].fillna(df["seats"].mode()[0], inplace=True)
C:\Users\Laiba\AppData\Local\Temp\ipykernel_26740\356958492.py:1: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
df["seats"].fillna(df["seats"].mode()[0], inplace=True)
In [8]:
print(df.isnull().sum())
name 0 year 0 selling_price 0 km_driven 0 fuel 0 seller_type 0 transmission 0 owner 0 mileage 0 engine 0 max_power 0 torque 1 seats 0 dtype: int64
In [9]:
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
print("Numerical Columns:", numerical_cols)
Numerical Columns: ['year', 'selling_price', 'km_driven', 'seats']
In [10]:
df[numerical_cols].describe().T
Out[10]:
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| year | 7907.0 | 2013.982168 | 3.866650 | 1994.0 | 2012.0 | 2015.0 | 2017.0 | 2020.0 |
| selling_price | 7907.0 | 649741.656380 | 813556.530537 | 29999.0 | 270000.0 | 450000.0 | 690000.0 | 10000000.0 |
| km_driven | 7907.0 | 69192.556469 | 56789.761602 | 1.0 | 35000.0 | 60000.0 | 95750.0 | 2360457.0 |
| seats | 7907.0 | 5.416719 | 0.959588 | 2.0 | 5.0 | 5.0 | 5.0 | 14.0 |
In [11]:
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
print("Categorical Columns:", categorical_cols)
Categorical Columns: ['name', 'fuel', 'seller_type', 'transmission', 'owner', 'mileage', 'engine', 'max_power', 'torque']
In [12]:
for col in categorical_cols:
print(f"\nColumn: {col}")
print(f"Unique values in '{col}': {df[col].unique()}")
print(df[col].describe())
Column: name Unique values in 'name': ['Maruti Swift Dzire VDI' 'Skoda Rapid 1.5 TDI Ambition' 'Honda City 2017-2020 EXi' ... 'Tata Nexon 1.5 Revotorq XT' 'Ford Freestyle Titanium Plus Diesel BSIV' 'Toyota Innova 2.5 GX (Diesel) 8 Seater BS IV'] count 7907 unique 1983 top Maruti Swift Dzire VDI freq 129 Name: name, dtype: object Column: fuel Unique values in 'fuel': ['Diesel' 'Petrol' 'LPG' 'CNG'] count 7907 unique 4 top Diesel freq 4299 Name: fuel, dtype: object Column: seller_type Unique values in 'seller_type': ['Individual' 'Dealer' 'Trustmark Dealer'] count 7907 unique 3 top Individual freq 6564 Name: seller_type, dtype: object Column: transmission Unique values in 'transmission': ['Manual' 'Automatic'] count 7907 unique 2 top Manual freq 6866 Name: transmission, dtype: object Column: owner Unique values in 'owner': ['First Owner' 'Second Owner' 'Third Owner' 'Fourth & Above Owner' 'Test Drive Car'] count 7907 unique 5 top First Owner freq 5215 Name: owner, dtype: object Column: mileage Unique values in 'mileage': ['23.4 kmpl' '21.14 kmpl' '17.7 kmpl' '23.0 kmpl' '16.1 kmpl' '20.14 kmpl' '17.3 km/kg' '23.59 kmpl' '20.0 kmpl' '19.01 kmpl' '17.3 kmpl' '19.3 kmpl' '18.9 kmpl' '18.15 kmpl' '24.52 kmpl' '19.7 kmpl' '22.54 kmpl' '21.0 kmpl' '25.5 kmpl' '26.59 kmpl' '21.5 kmpl' '20.3 kmpl' '21.4 kmpl' '24.7 kmpl' '18.2 kmpl' '16.8 kmpl' '24.3 kmpl' '14.0 kmpl' '18.6 kmpl' '33.44 km/kg' '23.95 kmpl' '17.0 kmpl' '20.63 kmpl' '13.93 kmpl' '16.0 kmpl' '17.8 kmpl' '18.5 kmpl' '12.55 kmpl' '12.99 kmpl' '14.8 kmpl' '13.5 kmpl' '26.0 kmpl' '20.65 kmpl' '27.3 kmpl' '11.36 kmpl' '17.68 kmpl' '14.28 kmpl' '18.53 kmpl' '14.84 kmpl' '21.12 kmpl' '20.36 kmpl' '21.27 kmpl' '18.16 kmpl' '22.0 kmpl' '25.1 kmpl' '20.51 kmpl' '21.66 kmpl' '25.2 kmpl' '22.9 kmpl' '16.02 kmpl' '20.54 kmpl' '22.77 kmpl' '15.71 kmpl' '23.1 kmpl' '19.02 kmpl' '19.81 kmpl' '26.2 km/kg' '16.47 kmpl' '15.04 kmpl' '19.1 kmpl' '21.79 kmpl' '18.8 kmpl' '21.21 kmpl' '15.37 kmpl' '11.79 kmpl' '19.0 kmpl' '14.3 kmpl' '15.8 kmpl' '15.1 kmpl' '19.09 kmpl' '22.32 kmpl' '21.9 kmpl' '14.53 kmpl' '21.63 kmpl' '20.85 kmpl' '20.45 kmpl' '19.67 kmpl' '23.01 kmpl' '20.77 kmpl' '17.92 kmpl' '17.01 kmpl' '22.37 kmpl' '19.33 kmpl' '9.5 kmpl' '12.83 kmpl' '22.48 kmpl' '16.78 kmpl' '14.67 kmpl' '15.0 kmpl' '13.96 kmpl' '18.0 kmpl' '12.07 kmpl' '26.21 kmpl' '10.8 kmpl' '16.3 kmpl' '13.6 kmpl' '14.74 kmpl' '15.6 kmpl' '19.56 kmpl' '22.69 kmpl' '19.16 kmpl' '18.12 kmpl' '12.1 kmpl' '17.5 kmpl' '42.0 kmpl' '20.4 kmpl' '21.1 kmpl' '19.44 kmpl' '13.0 kmpl' '21.43 kmpl' '22.95 kmpl' '16.2 kmpl' '15.3 kmpl' '28.09 kmpl' '17.4 kmpl' '19.4 kmpl' '26.6 km/kg' '17.6 kmpl' '28.4 kmpl' '14.1 kmpl' '25.17 kmpl' '22.74 kmpl' '17.57 kmpl' '16.95 kmpl' '19.49 kmpl' '17.21 kmpl' '13.2 kmpl' '14.2 kmpl' '26.8 kmpl' '25.4 kmpl' '11.5 kmpl' '27.28 kmpl' '17.97 kmpl' '12.8 kmpl' '16.55 kmpl' '12.05 kmpl' '14.07 kmpl' '21.02 kmpl' '11.57 kmpl' '17.9 kmpl' '15.96 kmpl' '17.1 kmpl' '17.19 kmpl' '21.01 kmpl' '24.0 kmpl' '25.6 kmpl' '21.38 kmpl' '23.84 kmpl' '23.08 kmpl' '14.24 kmpl' '20.71 kmpl' '15.64 kmpl' '14.5 kmpl' '16.34 kmpl' '27.39 kmpl' '11.1 kmpl' '13.9 kmpl' '20.88 km/kg' '20.92 kmpl' '23.8 kmpl' '24.4 kmpl' '15.29 kmpl' '21.19 kmpl' '22.5 kmpl' '19.6 kmpl' '23.65 kmpl' '25.32 kmpl' '23.5 kmpl' '16.6 kmpl' '23.9 kmpl' '20.8 kmpl' '27.62 kmpl' '12.9 kmpl' '25.44 kmpl' '17.88 kmpl' '22.7 kmpl' '17.2 kmpl' '15.42 kmpl' '19.68 kmpl' '18.7 kmpl' '15.4 kmpl' '19.34 kmpl' '22.71 kmpl' '25.8 kmpl' '13.7 kmpl' '12.2 kmpl' '18.49 kmpl' '9.0 kmpl' '0.0 kmpl' '13.58 kmpl' '10.1 kmpl' '20.5 kmpl' '25.0 kmpl' '10.5 kmpl' '22.07 kmpl' '22.3 kmpl' '15.26 kmpl' '20.62 kmpl' '27.4 kmpl' '23.2 kmpl' '14.4 kmpl' '18.4 kmpl' '30.46 km/kg' '14.02 kmpl' '11.0 kmpl' '20.6 kmpl' '22.05 kmpl' '20.2 kmpl' '18.1 kmpl' '22.1 kmpl' '19.87 kmpl' '13.01 kmpl' '18.06 kmpl' '26.1 kmpl' '16.52 kmpl' '13.55 kmpl' '24.2 kmpl' '25.83 kmpl' '11.2 kmpl' '17.09 kmpl' '21.03 kmpl' '17.45 kmpl' '21.64 kmpl' '21.94 km/kg' '13.87 kmpl' '19.98 kmpl' '20.52 kmpl' '23.57 kmpl' '11.7 kmpl' '17.43 kmpl' '18.88 kmpl' '13.68 kmpl' '11.18 kmpl' '20.89 kmpl' '11.8 kmpl' '19.62 kmpl' '21.7 kmpl' '14.9 kmpl' '19.5 kmpl' '10.91 kmpl' '15.7 kmpl' '20.73 kmpl' '15.85 kmpl' '20.7 kmpl' '14.23 kmpl' '16.5 kmpl' '17.36 kmpl' '12.6 kmpl' '16.36 kmpl' '14.95 kmpl' '16.9 kmpl' '19.2 kmpl' '16.96 kmpl' '22.15 kmpl' '18.78 kmpl' '19.61 kmpl' '17.71 kmpl' '18.3 kmpl' '19.12 kmpl' '19.72 kmpl' '12.0 kmpl' '11.4 kmpl' '23.03 kmpl' '11.07 kmpl' '15.9 kmpl' '17.67 kmpl' '20.46 kmpl' '13.1 kmpl' '13.45 km/kg' '24.8 kmpl' '15.73 kmpl' '15.11 kmpl' '12.7 kmpl' '21.2 kmpl' '20.38 kmpl' '21.56 kmpl' '13.22 kmpl' '14.49 kmpl' '15.05 kmpl' '23.26 kmpl' '15.41 kmpl' '13.8 kmpl' '22.27 kmpl' '32.52 km/kg' '14.66 kmpl' '12.12 kmpl' '16.84 kmpl' '14.09 kmpl' '14.7 kmpl' '13.4 kmpl' '15.5 kmpl' '13.49 kmpl' '11.88 km/kg' '14.6 kmpl' '10.75 kmpl' '24.5 kmpl' '11.74 kmpl' '16.07 kmpl' '15.63 kmpl' '26.3 km/kg' '23.7 km/kg' '25.47 kmpl' '17.05 kmpl' '23.3 kmpl' '11.9 kmpl' '13.38 kmpl' '20.86 kmpl' '19.2 km/kg' '10.9 kmpl' '18.25 kmpl' '15.2 kmpl' '20.37 kmpl' '17.8 km/kg' '21.8 kmpl' '11.96 kmpl' '24.04 kmpl' '19.69 kmpl' '13.73 kmpl' '21.04 kmpl' '25.01 kmpl' '10.93 kmpl' '10.9 km/kg' '24.29 kmpl' '13.44 kmpl' '20.07 kmpl' '21.1 km/kg' '19.08 kmpl' '20.34 kmpl' '11.68 kmpl' '12.5 kmpl' '12.3 kmpl' '23.87 kmpl' '16.38 kmpl' '17.42 kmpl' '10.0 kmpl' '18.24 kmpl' '10.71 kmpl' '19.59 kmpl' '16.7 kmpl' '19.83 kmpl' '21.76 kmpl' '16.05 kmpl' '20.28 kmpl' '16.25 kmpl' '16.73 kmpl' '18.48 kmpl' '13.2 km/kg' '21.4 km/kg' '14.99 kmpl' '18.76 kmpl' '16.4 kmpl' '19.64 kmpl' '14.94 kmpl' '16.6 km/kg' '16.0 km/kg' '17.11 kmpl' '22.8 km/kg' '32.26 km/kg' '33.0 km/kg' '12.4 kmpl' '18.44 kmpl' '16.09 kmpl' '19.0 km/kg' '12.62 kmpl' '21.13 kmpl' '15.17 kmpl' '21.73 kmpl' '21.72 kmpl' '12.85 kmpl' '14.81 kmpl' '13.24 kmpl' '14.4 km/kg' '21.49 kmpl' '14.62 kmpl' '26.83 km/kg' '11.45 kmpl' '12.08 kmpl' '15.74 kmpl' '11.3 kmpl' '15.1 km/kg' '14.21 kmpl' '11.72 kmpl' '16.51 kmpl'] count 7907 unique 393 top 18.9 kmpl freq 225 Name: mileage, dtype: object Column: engine Unique values in 'engine': ['1248 CC' '1498 CC' '1497 CC' '1396 CC' '1298 CC' '1197 CC' '1061 CC' '796 CC' '1364 CC' '1399 CC' '1461 CC' '993 CC' '1198 CC' '1199 CC' '998 CC' '1591 CC' '2179 CC' '1368 CC' '2982 CC' '2494 CC' '2143 CC' '2477 CC' '1462 CC' '2755 CC' '1968 CC' '1798 CC' '1196 CC' '1373 CC' '1598 CC' '1998 CC' '1086 CC' '1194 CC' '1172 CC' '1405 CC' '1582 CC' '999 CC' '2487 CC' '1999 CC' '3604 CC' '2987 CC' '1995 CC' '1451 CC' '1969 CC' '2967 CC' '2497 CC' '1797 CC' '1991 CC' '2362 CC' '1493 CC' '1599 CC' '1341 CC' '1794 CC' '799 CC' '1193 CC' '2696 CC' '1495 CC' '1186 CC' '1047 CC' '2498 CC' '2956 CC' '2523 CC' '1120 CC' '624 CC' '1496 CC' '1984 CC' '2354 CC' '814 CC' '793 CC' '1799 CC' '936 CC' '1956 CC' '1997 CC' '1499 CC' '1948 CC' '2997 CC' '2489 CC' '2499 CC' '2609 CC' '2953 CC' '1150 CC' '1994 CC' '1388 CC' '1527 CC' '2199 CC' '995 CC' '2993 CC' '1586 CC' '1390 CC' '909 CC' '2393 CC' '3198 CC' '1339 CC' '2835 CC' '2092 CC' '1595 CC' '2496 CC' '1596 CC' '1597 CC' '2596 CC' '2148 CC' '1299 CC' '1590 CC' '2231 CC' '2694 CC' '2200 CC' '1795 CC' '1896 CC' '1796 CC' '1422 CC' '1489 CC' '2359 CC' '2197 CC' '2999 CC' '1781 CC' '2650 CC' '1343 CC' '2446 CC' '3498 CC' '2198 CC' '2776 CC' '1950 CC'] count 7907 unique 121 top 1248 CC freq 1017 Name: engine, dtype: object Column: max_power Unique values in 'max_power': ['74 bhp' '103.52 bhp' '78 bhp' '90 bhp' '88.2 bhp' '81.86 bhp' '57.5 bhp' '37 bhp' '67.1 bhp' '68.1 bhp' '108.45 bhp' '60 bhp' '73.9 bhp' '67 bhp' '82 bhp' '88.5 bhp' '46.3 bhp' '88.73 bhp' '64.1 bhp' '98.6 bhp' '88.8 bhp' '83.81 bhp' '83.1 bhp' '47.3 bhp' '73.8 bhp' '34.2 bhp' '35 bhp' '81.83 bhp' '40.3 bhp' '121.3 bhp' '138.03 bhp' '160.77 bhp' '117.3 bhp' '116.3 bhp' '83.14 bhp' '67.05 bhp' '168.5 bhp' '100 bhp' '120.7 bhp' '98.63 bhp' '175.56 bhp' '103.25 bhp' '171.5 bhp' '100.6 bhp' '174.33 bhp' '187.74 bhp' '170 bhp' '78.9 bhp' '88.76 bhp' '86.8 bhp' '108.495 bhp' '108.62 bhp' '93.7 bhp' '103.6 bhp' '98.59 bhp' '189 bhp' '67.04 bhp' '68.05 bhp' '58.2 bhp' '82.85 bhp' '81.80 bhp' '73 bhp' '120 bhp' '94.68 bhp' '160 bhp' '65 bhp' '155 bhp' '69.01 bhp' '126.32 bhp' '138.1 bhp' '83.8 bhp' '126.2 bhp' '98.96 bhp' '62.1 bhp' '86.7 bhp' '188 bhp' '214.56 bhp' '177 bhp' '280 bhp' '148.31 bhp' '254.79 bhp' '190 bhp' '177.46 bhp' '204 bhp' '141 bhp' '117.6 bhp' '241.4 bhp' '282 bhp' '150 bhp' '147.5 bhp' '108.5 bhp' '103.5 bhp' '183 bhp' '181.04 bhp' '157.7 bhp' '164.7 bhp' '91.1 bhp' '400 bhp' '68 bhp' '75 bhp' '85.8 bhp' '87.2 bhp' '53 bhp' '118 bhp' '103.2 bhp' '83 bhp' '84 bhp' '58.16 bhp' '147.94 bhp' '74.02 bhp' '53.3 bhp' '80 bhp' '88.7 bhp' '97.7 bhp' '121.36 bhp' '162 bhp' '140 bhp' '94 bhp' '100.57 bhp' '82.9 bhp' '83.11 bhp' '70 bhp' '153.86 bhp' '121 bhp' '126.3 bhp' '73.97 bhp' '171 bhp' '69 bhp' '99.6 bhp' '102 bhp' '105 bhp' '63 bhp' '79.4 bhp' '97.9 bhp' '63.1 bhp' '66.1 bhp' '110 bhp' '174.5 bhp' '53.26 bhp' '73.75 bhp' '67.06 bhp' '64.08 bhp' '37.5 bhp' '189.3 bhp' '158.8 bhp' '61.7 bhp' '55.2 bhp' '71.01 bhp' '73.74 bhp' '147.9 bhp' '71 bhp' '77 bhp' '121.4 bhp' '113.4 bhp' '47 bhp' '130 bhp' '57.6 bhp' '138 bhp' '52.8 bhp' '53.64 bhp' '53.5 bhp' '76.8 bhp' '82.4 bhp' '113.42 bhp' '76 bhp' '84.8 bhp' '56.3 bhp' '218 bhp' '112 bhp' '92 bhp' '105.5 bhp' '169 bhp' '95 bhp' '72.4 bhp' '115 bhp' '152 bhp' '91.2 bhp' '156 bhp' '74.9 bhp' '62 bhp' '105.3 bhp' '73.94 bhp' '85.80 bhp' '85 bhp' '118.3 bhp' '72 bhp' '147.51 bhp' '58 bhp' '64 bhp' '126.24 bhp' '76.9 bhp' '194.3 bhp' '99.23 bhp' '89.84 bhp' '123.7 bhp' '118.35 bhp' '99 bhp' '241 bhp' '136 bhp' '261.4 bhp' '104.68 bhp' '37.48 bhp' '104 bhp' '88.50 bhp' '63.12 bhp' '91.7 bhp' '102.5 bhp' '177.6 bhp' '45 bhp' '123.37 bhp' '147.8 bhp' '184 bhp' '84.48 bhp' '68.07 bhp' '74.96 bhp' '167.6 bhp' '152.87 bhp' '112.2 bhp' '83.83 bhp' '197 bhp' '110.4 bhp' '104.55 bhp' '103 bhp' '103.3 bhp' '66 bhp' '108.6 bhp' '165 bhp' '163.7 bhp' '116.9 bhp' '94.93 bhp' '127 bhp' '198.5 bhp' '179.5 bhp' '120.69 bhp' '121.31 bhp' '138.08 bhp' '187.7 bhp' '80.8 bhp' '86.79 bhp' '93.87 bhp' '116.6 bhp' '143 bhp' '92.7 bhp' '88 bhp' '58.33 bhp' '78.8 bhp' '64.4 bhp' '125 bhp' '139.01 bhp' '254.8 bhp' '181 bhp' '258 bhp' '55.23 bhp' '270.9 bhp' '265 bhp' '157.75 bhp' '101 bhp' '186 bhp' '187.4 bhp' '224 bhp' '64.9 bhp' '148 bhp' '35.5 bhp' '89.75 bhp' '32.8 bhp' '91.72 bhp' '106 bhp' '98.97 bhp' '66.6 bhp' '86 bhp' '65.3 bhp' '98.82 bhp' '198.25 bhp' '38 bhp' '142 bhp' '132 bhp' '174.57 bhp' '178 bhp' '163.2 bhp' '203.2 bhp' '177.5 bhp' '175 bhp' '57 bhp' '80.84 bhp' '68.4 bhp' '167.67 bhp' '170.63 bhp' '52 bhp' '149.5 bhp' '48.21 bhp' ' bhp' '201.1 bhp' '100.5 bhp' '144 bhp' '194.4 bhp' '168.7 bhp' '104.5 bhp' '103.26 bhp' '116.4 bhp' '98.79 bhp' '80.9 bhp' '58.3 bhp' '272 bhp' '235 bhp' '167.62 bhp' '170.30 bhp' '139.46 bhp' '158 bhp' '110.5 bhp' '82.5 bhp' '141.1 bhp' '38.4 bhp' '197.2 bhp' '161 bhp' '194 bhp' '122.4 bhp' '134.10 bhp' '60.2 bhp' '134 bhp' '203 bhp' '135.1 bhp'] count 7907 unique 321 top 74 bhp freq 377 Name: max_power, dtype: object Column: torque Unique values in 'torque': ['190Nm@ 2000rpm' '250Nm@ 1500-2500rpm' '12.7@ 2,700(kgm@ rpm)' '22.4 kgm at 1750-2750rpm' '11.5@ 4,500(kgm@ rpm)' '113.75nm@ 4000rpm' '7.8@ 4,500(kgm@ rpm)' '59Nm@ 2500rpm' '170Nm@ 1800-2400rpm' '160Nm@ 2000rpm' '248Nm@ 2250rpm' '78Nm@ 4500rpm' '84Nm@ 3500rpm' '115Nm@ 3500-3600rpm' '200Nm@ 1750rpm' '62Nm@ 3000rpm' '219.7Nm@ 1500-2750rpm' '114Nm@ 3500rpm' '115Nm@ 4000rpm' '69Nm@ 3500rpm' '172.5Nm@ 1750rpm' '6.1kgm@ 3000rpm' '114.7Nm@ 4000rpm' '60Nm@ 3500rpm' '90Nm@ 3500rpm' '151Nm@ 4850rpm' '104Nm@ 4000rpm' '320Nm@ 1700-2700rpm' '250Nm@ 1750-2500rpm' '145Nm@ 4600rpm' '146Nm@ 4800rpm' '343Nm@ 1400-3400rpm' '200Nm@ 1400-3400rpm' '200Nm@ 1250-4000rpm' '400Nm@ 2000-2500rpm' '138Nm@ 4400rpm' '360Nm@ 1200-3400rpm' '200Nm@ 1200-3600rpm' '380Nm@ 1750-2500rpm' '173Nm@ 4000rpm' '400Nm@ 1750-3000rpm' '400Nm@ 1400-2800rpm' '200Nm@ 1750-3000rpm' '111.7Nm@ 4000rpm' '219.6Nm@ 1500-2750rpm' '112Nm@ 4000rpm' '250Nm@ 1500-3000rpm' '130Nm@ 4000rpm' '205Nm@ 1750-3250rpm' '280Nm@ 1350-4600rpm' '99.04Nm@ 4500rpm' '77Nm@ 3500rpm' '110Nm@ 3750rpm' '153Nm@ 3800rpm' '113.7Nm@ 4000rpm' '114Nm@ 4000rpm' '113Nm@ 4200rpm' '101Nm@ 3000rpm' '290Nm@ 1800-2800rpm' '120Nm@ 4250rpm' '250Nm@ 1500~4500rpm' '96 Nm at 3000 rpm' '360Nm@ 1750-2800rpm' '135Nm@ 2500rpm' '259.8Nm@ 1900-2750rpm' '200Nm@ 1900rpm' '259.9Nm@ 1900-2750rpm' '91Nm@ 4250rpm' '96.1Nm@ 3000rpm' '109Nm@ 4500rpm' '400nm@ 1750-3000rpm' '202Nm@ 3600-5200rpm' '430Nm@ 1750-2500rpm' '347Nm@ 4300rpm' '382nm@ 1750-2250rpm' '620Nm@ 1600-2400rpm' '400Nm@ 1750-2500rpm' '250@ 1250-5000rpm' '500Nm@ 1600-1800rpm' '250Nm@ 1600-3600rpm' '400Nm' '550Nm@ 1750-2750rpm' '490Nm@ 1600rpm' '250 Nm at 2750 rpm' '177.5Nm@ 4700rpm' '170Nm@ 1750-4000rpm' '300Nm@ 1200-4000rpm' '300Nm@ 1200-1400rpm' '260Nm@ 1500-2750rpm' '213Nm@ 4500rpm' '224Nm@ 4000rpm' '640Nm@ 1740rpm' '113Nm@ 4500rpm' '95Nm@ 3000-4300rpm' '13.1kgm@ 4600rpm' '205Nm@ 1800-2800rpm' '71Nm@ 3500rpm' '190Nm@ 1750-3000rpm' '146Nm at 4800 rpm' '14.9 KGM at 3000 RPM' '115Nm@ 3200rpm' '117nm@ 4000rpm' '320Nm@ 1500-3000rpm' '72Nm@ 4386rpm' '11.4 kgm at 4,000 rpm' '140Nm@ 1500-4000rpm' '134Nm@ 4000rpm' '150Nm@ 4500rpm' '340Nm@ 1800-3250rpm' '240Nm@ 1600-2800rpm' '330Nm@ 1600-2800rpm' '12.5@ 3,500(kgm@ rpm)' '110Nm@ 4800rpm' '111.8Nm@ 4000rpm' '11.8@ 3,200(kgm@ rpm)' '135.4Nm@ 2500rpm' '300Nm@ 1750-2500rpm' '190.25nm@ 1750-2250rpm' '140Nm@ 1800-3000rpm' '20.4@ 1400-3400(kgm@ rpm)' '247Nm@ 1800-2000rpm' '223Nm@ 1600-2200rpm' '180 Nm at 1440-1500rpm' '195Nm@ 1400-2200rpm' '154.9Nm@ 4200rpm' '114.73Nm@ 4000rpm' '160Nm@ 1500-2750rpm' '108Nm@ 4400rpm' '190.24nm@ 1750-2250rpm' '200Nm@ 2000-3500rpm' '420Nm@ 1400-2600rpm' '100Nm@ 2700rpm' '51Nm@ 4000rpm' '250Nm@ 1250-5300rpm' '132Nm@ 3000rpm' '350Nm@ 1500-2750rpm' '218Nm@ 4200rpm' '14.9@ 3,000(kgm@ rpm)' '24@ 1,900-2,750(kgm@ rpm)' '13.5@ 2,500(kgm@ rpm)' '85Nm@ 3000rpm' '74.5Nm@ 4000rpm' '160Nm@ 1750rpm' '180.4Nm@ 1750-2500rpm' '230Nm@ 1500-2500rpm' '113.75Nm@ 4000rpm' '219.66nm@ 1500-2750rpm' '245Nm@ 1750rpm' '360Nm@ 1400-3200rpm' '320Nm@ 2000rpm' '135 Nm at 2500 rpm ' '24 KGM at 1900-2750 RPM' '190Nm@ 1750-2250rpm' '204Nm@ 2000-2750rpm' '14.3@ 1,800-3,000(kgm@ rpm)' '250nm@ 1500-2750rpm' '125Nm@ 2000rpm' '172Nm@ 4300rpm' '150Nm@ 1750rpm' '102Nm@ 4000rpm' '85Nm@ 2500rpm' '8.5@ 2,500(kgm@ rpm)' '180Nm@ 1440-1500rpm' '106.5Nm@ 4400rpm' '108.5Nm@ 5000rpm' '350Nm@ 1750-2500rpm' '144.15nm@ 4500rpm' '104Nm@ 4400rpm' '99Nm@ 4500rpm' '200Nm@ 2000rpm' '280Nm@ 1800-2800rpm' '142.5Nm@ 1750rpm' '140Nm@ 4400rpm' '115@ 2,500(kgm@ rpm)' '196Nm@ 5000rpm' '260 Nm at 1800-2200 rpm' '9.8@ 3,000(kgm@ rpm)' '209Nm@ 2000rpm' '135 Nm at 2500 rpm' '140Nm@ 4200rpm' '220Nm at 1400-2600 rpm' '48Nm@ 3000rpm' '171Nm@ 1800rpm' '277.5Nm@ 1700-2200rpm' '215Nm@ 3600rpm' '219.6Nm@ 1750-2750rpm' '195Nm@ 1440-2200rpm' '13@ 2,500(kgm@ rpm)' '180Nm@ 2000rpm' '200Nm@ 1400-2200rpm' '380Nm(38.7kgm)@ 2500rpm' '110Nm@ 4400rpm' '72Nm@ 4388rpm' '263.7Nm@ 2500rpm' '320Nm@ 1600-2800rpm' '25.5@ 1,500-3,000(kgm@ rpm)' '16.3@ 2,000(kgm@ rpm)' '190 Nm at 1750 rpm ' '94.14Nm@ 3500rpm' '12@ 3,500(kgm@ rpm)' '113Nm@ 5000rpm' '280Nm@ 2400-2800rpm' '96Nm@ 3500rpm' '16@ 2,000(kgm@ rpm)' '320Nm@ 1750-3000rpm' '114.73nm@ 4000rpm' '320Nm@ 1750-2500rpm' '138nm@ 4400rpm' '190Nm@ 1750rpm' '789Nm@ 2250rpm' '259.87Nm@ 1900-2750rpm' '205Nm@ 1750rpm' '436.39Nm@ 1800-2500rpm' '182.5Nm@ 1500-1800rpm' '90.3Nm@ 4200rpm' '12.5@ 2,500(kgm@ rpm)' '215Nm@ 1750-3000rpm' '215Nm@ 1750-3000' '305Nm@ 2000rpm' '540Nm@ 2000rpm' '327Nm@ 2600rpm' '300Nm@ 1600-3000rpm' '620Nm@ 2000-2500rpm' '450Nm@ 1600-2400rpm' '19@ 1,800(kgm@ rpm)' '9.2@ 4,200(kgm@ rpm)' '145@ 4,100(kgm@ rpm)' '51Nm@ 4000+/-500rpm' '110Nm@ 3000rpm' '148Nm@ 3500rpm' '116Nm@ 4750rpm' '48@ 3,000+/-500(NM@ rpm)' '148Nm@ 4000rpm' '222Nm@ 4300rpm' '135.3Nm@ 5000rpm' '98Nm@ 1600-3000rpm' '170Nm@ 1400-4500rpm' '343Nm@ 1400-2800rpm' '402Nm@ 1600-3000rpm' '113Nm@ 3300rpm' '99.07Nm@ 4500rpm' '210nm@ 1600-2200rpm' '190 Nm at 1750 rpm ' '32.1kgm@ 2000rpm' '224nm@ 1500-2750rpm' '400nm@ 1750-2500rpm' '215Nm@ 1750-2500rpm' '25@ 1,800-2,800(kgm@ rpm)' '197Nm@ 1750rpm' '136.3Nm@ 4200rpm' '470Nm@ 1750-2500rpm' '11@ 3,000(kgm@ rpm)' '142Nm@ 4000rpm' '145Nm@ 4100rpm' '320Nm@ 1500-2800rpm' '123Nm@ 1000-2500rpm' '218Nm@ 1400-2600rpm' '510@ 1600-2400' '220Nm@ 1500-2750rpm' '380Nm@ 2000rpm' '104Nm@ 3100rpm' '292Nm@ 2000rpm' '20@ 3,750(kgm@ rpm)' '46.5@ 1,400-2,800(kgm@ rpm)' '380Nm@ 2500rpm' '15@ 3,800(kgm@ rpm)' '136Nm@ 4250rpm' '228Nm@ 4400rpm' '149Nm@ 4500rpm' '187Nm@ 2500rpm' '146Nm@ 3400rpm' '8.6@ 3,500(kgm@ rpm)' '219.7Nm@ 1750-2750rpm' '190Nm@ 2000-3000' '450Nm@ 2000rpm' '300Nm@ 2000rpm' '230Nm@ 1800-2000rpm' '42@ 2,000(kgm@ rpm)' '110Nm@ 3000-4300rpm' '110(11.2)@ 4800' '330Nm@ 1800rpm' '225Nm@ 1500-2500rpm' '380Nm@ 1750-2750rpm' '28.3@ 1,700-2,200(kgm@ rpm)' '259.88Nm@ 1900-2750rpm' '580Nm@ 1400-3250rpm' '400 Nm /2000 rpm' '127Nm@ 3500rpm' '300Nm@ 1500-2500rpm' '132.3Nm@ 4000rpm' '113nm@ 4400rpm' '151NM@ 4850rpm' '153Nm@ 3750-3800rpm' '10.7@ 2,500(kgm@ rpm)' '124.6Nm@ 3500rpm' '78Nm@ 3500rpm' '219.9Nm@ 1750-2750rpm' '420.7Nm@ 1800-2500rpm' '130Nm@ 3000rpm' '424Nm@ 2000rpm' '130@ 2500(kgm@ rpm)' '99.8Nm@ 2700rpm' '113Nm@ 4,500rpm' '11.2@ 4,400(kgm@ rpm)' '240Nm@ 1850rpm' '16.1@ 4,200(kgm@ rpm)' '320Nm@ 1750-2700rpm' '115Nm@ 4500rpm' '245Nm@ 4000rpm' '321Nm@ 1600-2400rpm' '619Nm@ 1600-2400rpm' '380Nm@ 1750-3000rpm' '560Nm@ 1500rpm' '230Nm@ 1500-2250rpm' '90Nm@ 2650rpm' '260Nm@ 1800-2200rpm' '600Nm@ 2000rpm' '259.87nm@ 1500-3000rpm' '16.6@ 4,500(kgm@ rpm)' '219.66NM@ 1500-2750rpm' '12.5@ 3,000(kgm@ rpm)' '620Nm@ 1500-2500rpm' '250Nm@ 1500-4500rpm' '14.9@ 3,400(kgm@ rpm)' '25.5@ 1,900(kgm@ rpm)' '33.7@ 1,800(kgm@ rpm)' '285Nm@ 2400-4000rpm' '10.7@ 2,600(kgm@ rpm)' '250Nm@ 1000-2000rpm' '240Nm@ 1750rpm' '226Nm@ 4400rpm' '510Nm@ 1600-2800rpm' '259.87NM@ 1500-3000rpm' '155 Nm at 1600-2800 rpm' '240Nm@ 2000rpm' '103Nm@ 4500rpm' '13.5@ 4,800(kgm@ rpm)' '400Nm@ 1750-2750rpm' '175Nm@ 1500-4100rpm' '72.9Nm@ 2250rpm' '135.4Nm@ 2500' '245Nm@ 5000rpm' '57Nm@ 2500rpm' '96Nm@ 2500rpm' '215nm@ 1750-2500rpm' '10.4@ 3,200(kgm@ rpm)' '128Nm@ 3100rpm' '102Nm@ 2600rpm' '131Nm@ 4400rpm' '11.4@ 4,000(kgm@ rpm)' '250Nm@ 4250rpm' '343Nm@ 1600-2800rpm' '185Nm@ 1750-2750rpm' '12@ 2500(kgm@ rpm)' '12.4@ 2,600(kgm@ rpm)' '170Nm@ 4200rpm' '176Nm@ 1500rpm' '380NM@ 1800-2800rpm' '250Nm@ 1600-2000rpm' '24.5@ 3,500-4,500(kgm@ rpm)' '22.9@ 1,950-4,700(kgm@ rpm)' '113Nm@ 4400rpm' '121Nm@ 2800rpm' '210 / 1900' '250Nm@ 1250-5000rpm' '400Nm@ 175-2750rpm' '350Nm@ 1500-3500rpm' '175nm@ 1750-4000rpm' '115@ 2500(kgm@ rpm)' '110Nm@ 4500rpm' '190Nm@ 2000-3000rpm' '106Nm@ 2200rpm' '21.4@ 1,750-4,600(kgm@ rpm)' '96Nm@ 3000rpm' '23.6@ 4,250(kgm@ rpm)' '11.3kgm@ 4700rpm' '450Nm@ 1750-2500rpm' '35.7@ 1,750-3,000(kgm@ rpm)' '6@ 2,500(kgm@ rpm)' nan '13.9 kgm at 4200 rpm' '320Nm@ 1400-4100rpm' '150Nm@ 1700-4500rpm' '113.8Nm@ 4000rpm' '110@ 3,000(kgm@ rpm)' '151Nm@ 2400rpm' '62Nm@ 2500rpm' '18@ 1,600-2,200(kgm@ rpm)' '83Nm@ 3000rpm' '124.5Nm@ 3500rpm' '20@ 4,700(kgm@ rpm)' '300Nm@ 1600-4000rpm' '171.6Nm@ 1500-4000rpm' '21.4@ 1,900(kgm@ rpm)' '190@ 21,800(kgm@ rpm)' '5.7@ 2,500(kgm@ rpm)' '88.4Nm@ 4200rpm' '250 Nm at 1,500-3,000 rpm' '340nm@ 1750-3000rpm' '36.6@ 1,750-2,500(kgm@ rpm)' '12.5kgm@ 3500rpm' '6.1@ 3,000(kgm@ rpm)' '110Nm@ 4000rpm' '350nm@ 1800-2600rpm' '175nm@ 1500-4100rpm' '4.8kgm@ 3000rpm' '355Nm@ 4500rpm' '51@ 1,750-3,000(kgm@ rpm)' '119Nm@ 4250rpm' '410Nm@ 1600-2800rpm' '174Nm@ 4300rpm' '99.1Nm@ 4500rpm' '385Nm@ 1600-2500rpm' '180 Nm at 2000rpm' '190 Nm at 1750 rpm' '53@ 2,000-2,750(kgm@ rpm)' '360Nm@ 1400-2600rpm' '420Nm@ 2000rpm' '124Nm@ 3500rpm' '17.5@ 4,300(kgm@ rpm)' '360Nm@ 2000rpm' '145Nm@ 3750rpm' '85Nm@ 3500rpm' '190Nm@ 4200rpm' '190 Nm at 2000rpm' '13.5@ 2500(kgm@ rpm)' '250nm@ 1500-3000rpm' '159.8Nm@ 1500-2750rpm' '500Nm@ 2000rpm' '333Nm@ 1600-3200rpm' '400nm@ 2800rpm' '33@ 2,000-2,680(kgm@ rpm)' '10.2@ 2,600(kgm@ rpm)' '480Nm' '190Nm@ 4300rpm' '320Nm@ 1800-2800rpm' '380Nm@ 1750rpm' '250.06nm@ 1500-2750rpm' '90nm@ 3500rpm' '190Nm@ 3700rpm' '436.4Nm@ 1800-2500rpm' '96 Nm at 3000 rpm '] count 7906 unique 441 top 190Nm@ 2000rpm freq 530 Name: torque, dtype: object
In [13]:
def to_float_num(x):
if pd.isna(x):
return np.nan
s = str(x)
m = re.search(r"(\d+(\.\d+)?)", s)
return float(m.group(1)) if m else np.nan
def parse_torque(x):
if pd.isna(x):
return (np.nan, np.nan)
s = str(x).lower().replace(" ", "")
# torque number
torque_nm = None
m_nm = re.search(r"(\d+(\.\d+)?)\s*nm", s)
if m_nm:
torque_nm = float(m_nm.group(1))
else:
# sometimes no 'nm' unit but still has number
m_any = re.search(r"(\d+(\.\d+)?)", s)
torque_nm = float(m_any.group(1)) if m_any else np.nan
# rpm after @
torque_rpm = np.nan
m_rpm = re.search(r"@(\d+(\.\d+)?)", s)
if m_rpm:
torque_rpm = float(m_rpm.group(1))
else:
m_rpm2 = re.search(r"(\d+(\.\d+)?)rpm", s)
if m_rpm2:
torque_rpm = float(m_rpm2.group(1))
return torque_nm, torque_rpm
def clean_car_df(df):
df = df.copy()
# Strip column names
df.columns = [c.strip() for c in df.columns]
# Standardize some object columns
for col in ["name", "fuel", "seller_type", "transmission", "owner"]:
if col in df.columns:
df[col] = df[col].astype(str).str.strip()
df[col] = df[col].replace({"nan": np.nan, "None": np.nan, "": np.nan})
# Convert year, seats to numeric
for col in ["year", "seats"]:
if col in df.columns:
df[col] = pd.to_numeric(df[col], errors="coerce")
# selling_price, km_driven to numeric
for col in ["selling_price", "km_driven"]:
if col in df.columns:
df[col] = pd.to_numeric(df[col], errors="coerce")
# mileage: e.g., "17.0 kmpl", "23.4 km/kg"
if "mileage" in df.columns:
df["mileage_num"] = df["mileage"].apply(to_float_num)
# engine: e.g., "1248 CC"
if "engine" in df.columns:
df["engine_cc"] = df["engine"].apply(to_float_num)
# max_power: e.g., "74 bhp"
if "max_power" in df.columns:
df["max_power_bhp"] = df["max_power"].apply(to_float_num)
# torque: create torque_nm and torque_rpm
if "torque" in df.columns:
torque_parsed = df["torque"].apply(parse_torque)
df["torque_nm"] = torque_parsed.apply(lambda t: t[0])
df["torque_rpm"] = torque_parsed.apply(lambda t: t[1])
return df
df = clean_car_df(df)
display(df.head())
| name | year | selling_price | km_driven | fuel | seller_type | transmission | owner | mileage | engine | max_power | torque | seats | mileage_num | engine_cc | max_power_bhp | torque_nm | torque_rpm | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Maruti Swift Dzire VDI | 2014 | 450000 | 145500 | Diesel | Individual | Manual | First Owner | 23.4 kmpl | 1248 CC | 74 bhp | 190Nm@ 2000rpm | 5.0 | 23.40 | 1248.0 | 74.00 | 190.0 | 2000.0 |
| 1 | Skoda Rapid 1.5 TDI Ambition | 2014 | 370000 | 120000 | Diesel | Individual | Manual | Second Owner | 21.14 kmpl | 1498 CC | 103.52 bhp | 250Nm@ 1500-2500rpm | 5.0 | 21.14 | 1498.0 | 103.52 | 250.0 | 1500.0 |
| 2 | Honda City 2017-2020 EXi | 2006 | 158000 | 140000 | Petrol | Individual | Manual | Third Owner | 17.7 kmpl | 1497 CC | 78 bhp | 12.7@ 2,700(kgm@ rpm) | 5.0 | 17.70 | 1497.0 | 78.00 | 12.7 | 2.0 |
| 3 | Hyundai i20 Sportz Diesel | 2010 | 225000 | 127000 | Diesel | Individual | Manual | First Owner | 23.0 kmpl | 1396 CC | 90 bhp | 22.4 kgm at 1750-2750rpm | 5.0 | 23.00 | 1396.0 | 90.00 | 22.4 | 2750.0 |
| 4 | Maruti Swift VXI BSIII | 2007 | 130000 | 120000 | Petrol | Individual | Manual | First Owner | 16.1 kmpl | 1298 CC | 88.2 bhp | 11.5@ 4,500(kgm@ rpm) | 5.0 | 16.10 | 1298.0 | 88.20 | 11.5 | 4.0 |
In [14]:
df["torque_nm"].fillna(df["torque_nm"].median(), inplace=True)
df["torque_rpm"].fillna(df["torque_rpm"].median(), inplace=True)
C:\Users\Laiba\AppData\Local\Temp\ipykernel_26740\3206175945.py:1: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
df["torque_nm"].fillna(df["torque_nm"].median(), inplace=True)
C:\Users\Laiba\AppData\Local\Temp\ipykernel_26740\3206175945.py:2: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
df["torque_rpm"].fillna(df["torque_rpm"].median(), inplace=True)
In [15]:
df = df.copy()
df.drop(columns=["mileage", "engine", "max_power", "torque"], inplace=True, errors="ignore")
In [16]:
def missing_values_analysis(df):
missing = pd.DataFrame({
'Column': df.columns,
'Missing_Count': df.isnull().sum(),
'Missing_Percentage': (df.isnull().sum() / len(df)) * 100,
'Data_Type': df.dtypes
})
missing = missing[missing['Missing_Count'] > 0].sort_values('Missing_Percentage', ascending=False)
print("Missing Values Summary:")
if len(missing) > 0:
print(missing)
else:
print("No missing values found")
print(f"\nTotal missing values: {df.isnull().sum().sum()}")
print(f"Percentage of missing data: {(df.isnull().sum().sum() / (df.shape[0] * df.shape[1])) * 100:.2f}%")
In [17]:
missing_values_analysis(df)
Missing Values Summary:
Column Missing_Count Missing_Percentage Data_Type
max_power_bhp max_power_bhp 1 0.012647 float64
Total missing values: 1
Percentage of missing data: 0.00%
In [18]:
def duplicate_analysis(df):
duplicates = df.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicates}")
print(f"Percentage of duplicates: {(duplicates / len(df)) * 100:.2f}%")
if duplicates > 0:
print("\nDuplicate Rows:")
print(df[df.duplicated(keep=False)].sort_values(by=list(df.columns)))
In [19]:
duplicate_analysis(df)
Number of duplicate rows: 1189
Percentage of duplicates: 15.04%
Duplicate Rows:
name year selling_price km_driven \
1977 Audi Q3 2.0 TDI Quattro Premium Plus 2017 2825000 22000
7324 Audi Q3 2.0 TDI Quattro Premium Plus 2017 2825000 22000
2129 Audi Q5 3.0 TDI Quattro 2014 1850000 76131
7775 Audi Q5 3.0 TDI Quattro 2014 1850000 76131
131 Audi Q5 35TDI Premium Plus 2018 3975000 31800
... ... ... ... ...
6693 Volvo XC40 D4 Inscription BSIV 2019 3800000 20000
6963 Volvo XC40 D4 Inscription BSIV 2019 3800000 20000
7862 Volvo XC40 D4 Inscription BSIV 2019 3800000 20000
145 Volvo XC40 D4 R-Design 2018 3400000 22000
3251 Volvo XC40 D4 R-Design 2018 3400000 22000
fuel seller_type transmission owner seats mileage_num \
1977 Diesel Dealer Automatic First Owner 5.0 15.73
7324 Diesel Dealer Automatic First Owner 5.0 15.73
2129 Diesel Individual Automatic First Owner 5.0 13.22
7775 Diesel Individual Automatic First Owner 5.0 13.22
131 Diesel Dealer Automatic First Owner 5.0 17.01
... ... ... ... ... ... ...
6693 Diesel Individual Automatic First Owner 5.0 18.00
6963 Diesel Individual Automatic First Owner 5.0 18.00
7862 Diesel Individual Automatic First Owner 5.0 18.00
145 Diesel Dealer Automatic First Owner 5.0 18.00
3251 Diesel Dealer Automatic First Owner 5.0 18.00
engine_cc max_power_bhp torque_nm torque_rpm
1977 1968.0 174.33 380.0 1750.0
7324 1968.0 174.33 380.0 1750.0
2129 2967.0 241.40 580.0 1400.0
7775 2967.0 241.40 580.0 1400.0
131 1968.0 188.00 400.0 1750.0
... ... ... ... ...
6693 1969.0 190.00 400.0 2000.0
6963 1969.0 190.00 400.0 2000.0
7862 1969.0 190.00 400.0 2000.0
145 1969.0 190.00 400.0 2000.0
3251 1969.0 190.00 400.0 2000.0
[1801 rows x 14 columns]
In [20]:
def unique_values_analysis(df):
unique_info = pd.DataFrame({
'Column': df.columns,
'Unique_Count': [df[col].nunique() for col in df.columns],
'Unique_Percentage': [(df[col].nunique() / len(df)) * 100 for col in df.columns],
'Sample_Values': [df[col].unique()[:5].tolist() for col in df.columns]
})
print("\n", unique_info)
# Detailed view for categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
print("\n" + "-"*80)
print("Categorical Columns - Value Counts:")
for col in categorical_cols:
print(f"\n{col}:")
print(df[col].value_counts())
In [21]:
unique_values_analysis(df)
Column Unique_Count Unique_Percentage \
0 name 1983 25.079044
1 year 27 0.341470
2 selling_price 670 8.473504
3 km_driven 898 11.357025
4 fuel 4 0.050588
5 seller_type 3 0.037941
6 transmission 2 0.025294
7 owner 5 0.063235
8 seats 9 0.113823
9 mileage_num 381 4.818515
10 engine_cc 121 1.530290
11 max_power_bhp 318 4.021753
12 torque_nm 244 3.085873
13 torque_rpm 56 0.708233
Sample_Values
0 [Maruti Swift Dzire VDI, Skoda Rapid 1.5 TDI A...
1 [2014, 2006, 2010, 2007, 2017]
2 [450000, 370000, 158000, 225000, 130000]
3 [145500, 120000, 140000, 127000, 45000]
4 [Diesel, Petrol, LPG, CNG]
5 [Individual, Dealer, Trustmark Dealer]
6 [Manual, Automatic]
7 [First Owner, Second Owner, Third Owner, Fourt...
8 [5.0, 4.0, 7.0, 8.0, 6.0]
9 [23.4, 21.14, 17.7, 23.0, 16.1]
10 [1248.0, 1498.0, 1497.0, 1396.0, 1298.0]
11 [74.0, 103.52, 78.0, 90.0, 88.2]
12 [190.0, 250.0, 12.7, 22.4, 11.5]
13 [2000.0, 1500.0, 2.0, 2750.0, 4.0]
--------------------------------------------------------------------------------
Categorical Columns - Value Counts:
name:
name
Maruti Swift Dzire VDI 129
Maruti Alto 800 LXI 82
Maruti Alto LXi 71
BMW X4 M Sport X xDrive20d 62
Maruti Swift VDI 61
...
Honda Jazz 1.2 S AT i VTEC 1
Hyundai i20 Active 1.2 SX with AVN 1
Ambassador CLASSIC 1500 DSL AC 1
Tata Harrier XZ Dark Edition BSIV 1
Toyota Innova 2.5 GX (Diesel) 8 Seater BS IV 1
Name: count, Length: 1983, dtype: int64
fuel:
fuel
Diesel 4299
Petrol 3520
CNG 53
LPG 35
Name: count, dtype: int64
seller_type:
seller_type
Individual 6564
Dealer 1107
Trustmark Dealer 236
Name: count, dtype: int64
transmission:
transmission
Manual 6866
Automatic 1041
Name: count, dtype: int64
owner:
owner
First Owner 5215
Second Owner 2017
Third Owner 510
Fourth & Above Owner 160
Test Drive Car 5
Name: count, dtype: int64
In [22]:
def distribution_analysis(df):
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
# Distribution metrics
print("\nDistribution Metrics:")
for col in numerical_cols:
print(f"\n{col}:")
print(f" Skewness: {skew(df[col].dropna()):.4f}")
print(f" Kurtosis: {kurtosis(df[col].dropna()):.4f}")
# Normality test
if len(df[col].dropna()) >= 3:
stat, p_value = stats.shapiro(df[col].dropna().sample(min(5000, len(df[col].dropna()))))
print(f" Shapiro-Wilk test p-value: {p_value:.4f}")
print(f" Normal distribution: {'Yes' if p_value > 0.05 else 'No'}")
# Histograms
n_cols = len(numerical_cols)
n_rows = (n_cols + 2) // 3
fig, axes = plt.subplots(n_rows, 3, figsize=(15, 5*n_rows))
axes = axes.flatten() if n_cols > 1 else [axes]
for idx, col in enumerate(numerical_cols):
axes[idx].hist(df[col].dropna(), bins=30, edgecolor='black', alpha=0.7)
axes[idx].set_title(f'Distribution of {col}')
axes[idx].set_xlabel(col)
axes[idx].set_ylabel('Frequency')
axes[idx].grid(True, alpha=0.3)
# Hide empty subplots
for idx in range(len(numerical_cols), len(axes)):
axes[idx].axis('off')
plt.tight_layout()
plt.show()
# Box plots
fig, axes = plt.subplots(n_rows, 3, figsize=(15, 5*n_rows))
axes = axes.flatten() if n_cols > 1 else [axes]
for idx, col in enumerate(numerical_cols):
axes[idx].boxplot(df[col].dropna())
axes[idx].set_title(f'Box Plot of {col}')
axes[idx].set_ylabel(col)
axes[idx].grid(True, alpha=0.3)
for idx in range(len(numerical_cols), len(axes)):
axes[idx].axis('off')
plt.tight_layout()
plt.show()
distribution_analysis(df)
Distribution Metrics: year: Skewness: -0.9970 Kurtosis: 1.2795 Shapiro-Wilk test p-value: 0.0000 Normal distribution: No selling_price: Skewness: 4.1562 Kurtosis: 20.6221 Shapiro-Wilk test p-value: 0.0000 Normal distribution: No km_driven: Skewness: 11.3406 Kurtosis: 388.4493 Shapiro-Wilk test p-value: 0.0000 Normal distribution: No seats: Skewness: 1.9695 Kurtosis: 3.7963 Shapiro-Wilk test p-value: 0.0000 Normal distribution: No mileage_num: Skewness: -0.1426 Kurtosis: 0.5687 Shapiro-Wilk test p-value: 0.0000 Normal distribution: No engine_cc: Skewness: 1.1351 Kurtosis: 0.7313 Shapiro-Wilk test p-value: 0.0000 Normal distribution: No max_power_bhp: Skewness: 1.6394 Kurtosis: 3.7985 Shapiro-Wilk test p-value: 0.0000 Normal distribution: No torque_nm: Skewness: 1.0602 Kurtosis: 1.9373 Shapiro-Wilk test p-value: 0.0000 Normal distribution: No torque_rpm: Skewness: 0.1391 Kurtosis: -0.9095 Shapiro-Wilk test p-value: 0.0000 Normal distribution: No
Selling Price¶
- Highly right-skewed.
- Majority of vehicles are in lower price ranges.
- Few luxury cars create extreme high-price outliers.
Implication: Log transformation may improve model performance.
Kilometers Driven¶
- Distribution shows most vehicles in mid-range mileage.
- Long tail exists for high-mileage vehicles.
Implication: Higher mileage often lowers price.
Engine Capacity & Power¶
- Engine sizes cluster around common passenger vehicle ranges.
- Power distribution shows majority of cars in moderate power range.
- High-performance vehicles form outliers.
Implication: Engine size and power strongly influence price.
Mileage Efficiency¶
- Mileage values vary across fuel and engine types.
Implication: Higher mileage efficiency may influence resale value but not as strongly as engine power.
In [23]:
def outlier_detection(df):
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
print("\nIQR Method (Outliers beyond 1.5 * IQR):")
for col in numerical_cols:
Q1 = df[col].quantile(0.25)
Q3 = df[col].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
outlier_count = len(outliers)
outlier_percentage = (outlier_count / len(df)) * 100
print(f"\n{col}:")
print(f" Lower bound: {lower_bound:.2f}")
print(f" Upper bound: {upper_bound:.2f}")
print(f" Outliers count: {outlier_count}")
print(f" Outliers percentage: {outlier_percentage:.2f}%")
print("Z-Score Method (|Z-score| > 3):")
for col in numerical_cols:
z_scores = np.abs(stats.zscore(df[col].dropna()))
outliers = np.sum(z_scores > 3)
outlier_percentage = (outliers / len(df[col].dropna())) * 100
print(f"\n{col}:")
print(f" Outliers count: {outliers}")
print(f" Outliers percentage: {outlier_percentage:.2f}%")
In [24]:
outlier_detection(df)
IQR Method (Outliers beyond 1.5 * IQR): year: Lower bound: 2004.50 Upper bound: 2024.50 Outliers count: 166 Outliers percentage: 2.10% selling_price: Lower bound: -360000.00 Upper bound: 1320000.00 Outliers count: 598 Outliers percentage: 7.56% km_driven: Lower bound: -56125.00 Upper bound: 186875.00 Outliers count: 193 Outliers percentage: 2.44% seats: Lower bound: 5.00 Upper bound: 5.00 Outliers count: 1653 Outliers percentage: 20.91% mileage_num: Lower bound: 8.47 Upper bound: 30.63 Outliers count: 24 Outliers percentage: 0.30% engine_cc: Lower bound: 619.50 Upper bound: 2159.50 Outliers count: 1183 Outliers percentage: 14.96% max_power_bhp: Lower bound: 17.12 Upper bound: 152.93 Outliers count: 564 Outliers percentage: 7.13% torque_nm: Lower bound: -50.50 Upper bound: 353.50 Outliers count: 404 Outliers percentage: 5.11% torque_rpm: Lower bound: -1325.00 Upper bound: 6875.00 Outliers count: 0 Outliers percentage: 0.00% Z-Score Method (|Z-score| > 3): year: Outliers count: 78 Outliers percentage: 0.99% selling_price: Outliers count: 229 Outliers percentage: 2.90% km_driven: Outliers count: 70 Outliers percentage: 0.89% seats: Outliers count: 102 Outliers percentage: 1.29% mileage_num: Outliers count: 24 Outliers percentage: 0.30% engine_cc: Outliers count: 78 Outliers percentage: 0.99% max_power_bhp: Outliers count: 90 Outliers percentage: 1.14% torque_nm: Outliers count: 53 Outliers percentage: 0.67% torque_rpm: Outliers count: 0 Outliers percentage: 0.00%
In [25]:
def correlation_analysis(df):
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
# Pearson correlation
print("\nPearson Correlation Matrix:")
corr_matrix = df[numerical_cols].corr()
print(corr_matrix)
# Heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm',
center=0, square=True, linewidths=1)
plt.title('Correlation Heatmap (Pearson)')
plt.tight_layout()
plt.show()
# High correlations
print("High Correlations (|r| > 0.7):")
high_corr = []
for i in range(len(corr_matrix.columns)):
for j in range(i+1, len(corr_matrix.columns)):
if abs(corr_matrix.iloc[i, j]) > 0.7:
high_corr.append({
'Feature 1': corr_matrix.columns[i],
'Feature 2': corr_matrix.columns[j],
'Correlation': corr_matrix.iloc[i, j]
})
if high_corr:
print(pd.DataFrame(high_corr))
else:
print("No high correlations found.")
# Spearman correlation
print("Spearman Correlation Matrix:")
spearman_corr = df[numerical_cols].corr(method='spearman')
print(spearman_corr)
In [26]:
correlation_analysis(df)
Pearson Correlation Matrix:
year selling_price km_driven seats mileage_num \
year 1.000000 0.412268 -0.428434 -0.009144 0.329145
selling_price 0.412268 1.000000 -0.222196 0.041358 -0.126054
km_driven -0.428434 -0.222196 1.000000 0.227336 -0.173073
seats -0.009144 0.041358 0.227336 1.000000 -0.452085
mileage_num 0.329145 -0.126054 -0.173073 -0.452085 1.000000
engine_cc 0.018848 0.455734 0.205914 0.610309 -0.575831
max_power_bhp 0.226598 0.749674 -0.038159 0.191999 -0.374621
torque_nm 0.289279 0.619784 -0.003332 0.274297 -0.171046
torque_rpm 0.157434 -0.132285 -0.275886 -0.315482 0.092260
engine_cc max_power_bhp torque_nm torque_rpm
year 0.018848 0.226598 0.289279 0.157434
selling_price 0.455734 0.749674 0.619784 -0.132285
km_driven 0.205914 -0.038159 -0.003332 -0.275886
seats 0.610309 0.191999 0.274297 -0.315482
mileage_num -0.575831 -0.374621 -0.171046 0.092260
engine_cc 1.000000 0.703975 0.628822 -0.429501
max_power_bhp 0.703975 1.000000 0.770206 -0.216084
torque_nm 0.628822 0.770206 1.000000 -0.371573
torque_rpm -0.429501 -0.216084 -0.371573 1.000000
High Correlations (|r| > 0.7):
Feature 1 Feature 2 Correlation
0 selling_price max_power_bhp 0.749674
1 engine_cc max_power_bhp 0.703975
2 max_power_bhp torque_nm 0.770206
Spearman Correlation Matrix:
year selling_price km_driven seats mileage_num \
year 1.000000 0.699282 -0.622829 -0.010459 0.304563
selling_price 0.699282 1.000000 -0.348455 0.263143 -0.029221
km_driven -0.622829 -0.348455 1.000000 0.228042 -0.163102
seats -0.010459 0.263143 0.228042 1.000000 -0.439428
mileage_num 0.304563 -0.029221 -0.163102 -0.439428 1.000000
engine_cc -0.002641 0.513526 0.241342 0.519604 -0.462244
max_power_bhp 0.221005 0.671143 -0.026260 0.283206 -0.345271
torque_nm 0.273297 0.654343 0.058245 0.346184 -0.085568
torque_rpm 0.113698 -0.218231 -0.323879 -0.401392 0.150710
engine_cc max_power_bhp torque_nm torque_rpm
year -0.002641 0.221005 0.273297 0.113698
selling_price 0.513526 0.671143 0.654343 -0.218231
km_driven 0.241342 -0.026260 0.058245 -0.323879
seats 0.519604 0.283206 0.346184 -0.401392
mileage_num -0.462244 -0.345271 -0.085568 0.150710
engine_cc 1.000000 0.740780 0.708355 -0.546173
max_power_bhp 0.740780 1.000000 0.704594 -0.285792
torque_nm 0.708355 0.704594 1.000000 -0.466410
torque_rpm -0.546173 -0.285792 -0.466410 1.000000
In [27]:
def categorical_analysis(df):
new_category_cols = ['owner', 'fuel', 'seller_type', 'transmission']
# keep only columns that exist in df
categorical_cols = [c for c in new_category_cols if c in df.columns]
for col in categorical_cols:
plt.figure(figsize=(10, 6))
df[col].value_counts().plot(kind='bar', edgecolor='black', alpha=0.7)
plt.title(f'Distribution of {col}')
plt.xlabel(col)
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
In [28]:
categorical_analysis(df)
Fuel Type¶
- Diesel and Petrol vehicles dominate the dataset.
- Alternative fuels appear less frequently.
- Price patterns vary slightly by fuel type.
Transmission¶
- Manual vehicles are more common.
- Automatic vehicles often command higher prices.
Ownership¶
- First-owner cars typically have higher resale value.
- Cars with multiple owners tend to sell cheaper.
Seller Type¶
- Dealer listings sometimes show higher prices compared to individuals.
In [29]:
def bivariate_analysis(df, target_col='selling_price'):
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
new_category_cols = ['owner', 'fuel', 'seller_type', 'transmission']
# keep only columns that exist in df
categorical_cols = [c for c in new_category_cols if c in df.columns]
# Numerical vs Target
print(f"\nCorrelation with {target_col}:")
if target_col in numerical_cols:
correlations = df[numerical_cols].corrwith(df[target_col]).sort_values(ascending=False)
print(correlations)
# Scatter plots
other_numerical = [col for col in numerical_cols if col != target_col]
n_cols = len(other_numerical)
n_rows = (n_cols + 2) // 3
fig, axes = plt.subplots(n_rows, 3, figsize=(15, 5*n_rows))
axes = axes.flatten() if n_cols > 1 else [axes]
for idx, col in enumerate(other_numerical):
axes[idx].scatter(df[col], df[target_col], alpha=0.5)
axes[idx].set_xlabel(col)
axes[idx].set_ylabel(target_col)
axes[idx].set_title(f'{col} vs {target_col}')
axes[idx].grid(True, alpha=0.3)
for idx in range(len(other_numerical), len(axes)):
axes[idx].axis('off')
plt.tight_layout()
plt.show()
# Categorical vs Target
print(f"Categorical features vs {target_col}:")
for col in categorical_cols:
print(f"\n{col}:")
group_stats = df.groupby(col)[target_col].agg(['mean', 'median', 'std', 'count'])
print(group_stats)
# Box plot
plt.figure(figsize=(12, 6))
df.boxplot(column=target_col, by=col, figsize=(12, 6))
plt.title(f'{target_col} by {col}')
plt.suptitle('')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
In [30]:
bivariate_analysis(df)
Correlation with selling_price: selling_price 1.000000 max_power_bhp 0.749674 torque_nm 0.619784 engine_cc 0.455734 year 0.412268 seats 0.041358 mileage_num -0.126054 torque_rpm -0.132285 km_driven -0.222196 dtype: float64
Categorical features vs selling_price:
owner:
mean median std count
owner
First Owner 7.899799e+05 550000.0 9.362189e+05 5215
Fourth & Above Owner 2.331968e+05 185000.0 1.586916e+05 160
Second Owner 4.010442e+05 320000.0 3.273773e+05 2017
Test Drive Car 4.403800e+06 5923000.0 2.510598e+06 5
Third Owner 2.931873e+05 230499.5 2.303804e+05 510
<Figure size 1200x600 with 0 Axes>
fuel:
mean median std count
fuel
CNG 313415.037736 330000.0 105577.722776 53
Diesel 802450.282624 550000.0 915753.684639 4299
LPG 210885.714286 200000.0 79828.039976 35
Petrol 472665.203125 350000.0 631712.877861 3520
<Figure size 1200x600 with 0 Axes>
seller_type:
mean median std count
seller_type
Dealer 1.459910e+06 700000.0 1.576638e+06 1107
Individual 5.076407e+05 400000.0 4.872636e+05 6564
Trustmark Dealer 8.018390e+05 650000.0 5.080779e+05 236
<Figure size 1200x600 with 0 Axes>
transmission:
mean median std count
transmission
Automatic 1.882972e+06 1100000.0 1.641287e+06 1041
Manual 4.627634e+05 409999.0 2.975859e+05 6866
<Figure size 1200x600 with 0 Axes>
Key trends:
- Selling price decreases as kilometers driven increases.
- Price increases with engine size and power.
- Newer vehicles have significantly higher prices.
- Automatic transmission vehicles often cost more.
In [31]:
def pairplot_analysis(df, sample_size=2000):
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
# Sample data if too large
if len(df) > sample_size:
df_sample = df[numerical_cols].sample(sample_size)
print(f"\nUsing sample of {sample_size} rows for pair plot")
else:
df_sample = df[numerical_cols]
print("\nGenerating pair plot...")
sns.pairplot(df_sample, diag_kind='kde', plot_kws={'alpha': 0.6})
plt.tight_layout()
plt.show()
In [32]:
pairplot_analysis(df)
Using sample of 2000 rows for pair plot Generating pair plot...
In [33]:
def time_series_analysis(df, time_col='year'):
if time_col in df.columns:
print(f"\nAnalyzing trends by {time_col}:")
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
numerical_cols = [col for col in numerical_cols if col != time_col]
for col in numerical_cols:
yearly_stats = df.groupby(time_col)[col].agg(['mean', 'median', 'count'])
print(f"\n{col} by {time_col}:")
print(yearly_stats)
# Line plot
plt.figure(figsize=(12, 6))
yearly_stats['mean'].plot(marker='o', linewidth=2)
plt.title(f'Average {col} by {time_col}')
plt.xlabel(time_col)
plt.ylabel(f'Average {col}')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
In [34]:
time_series_analysis(df)
Analyzing trends by year:
selling_price by year:
mean median count
year
1994 7.200000e+04 72000.0 2
1995 5.500000e+04 55000.0 1
1996 4.100000e+04 41000.0 2
1997 8.611100e+04 50000.0 9
1998 5.788889e+04 58000.0 9
1999 7.692857e+04 62500.0 14
2000 7.668212e+04 77500.0 16
2001 4.650000e+04 45000.0 6
2002 1.037894e+05 75000.0 19
2003 1.015729e+05 85000.0 37
2004 1.094089e+05 90000.0 51
2005 1.437566e+05 110000.0 76
2006 1.704622e+05 140000.0 102
2007 1.791065e+05 142000.0 173
2008 2.055598e+05 170000.0 201
2009 2.288095e+05 180000.0 231
2010 2.740106e+05 220000.0 375
2011 3.220964e+05 265000.0 570
2012 3.531433e+05 300000.0 621
2013 4.606646e+05 389500.0 668
2014 5.166757e+05 450500.0 620
2015 5.967703e+05 525000.0 775
2016 7.003177e+05 550000.0 856
2017 8.901930e+05 645000.0 1010
2018 9.581836e+05 715000.0 806
2019 1.776986e+06 779000.0 583
2020 8.852702e+05 590000.0 74
km_driven by year:
mean median count
year
1994 80000.000000 80000.0 2
1995 70000.000000 70000.0 1
1996 51000.000000 51000.0 2
1997 99487.555556 100000.0 9
1998 71111.111111 70000.0 9
1999 89094.285714 75000.0 14
2000 106821.562500 90000.0 16
2001 48184.666667 38554.0 6
2002 94492.894737 80000.0 19
2003 95697.081081 90000.0 37
2004 98883.274510 96000.0 51
2005 104011.894737 90000.0 76
2006 108693.568627 100000.0 102
2007 115765.884393 100000.0 173
2008 99125.134328 100000.0 201
2009 98978.813853 90000.0 231
2010 102048.992000 91000.0 375
2011 96860.884211 90000.0 570
2012 94988.974235 90000.0 621
2013 85522.365269 79328.0 668
2014 78887.977419 72000.0 620
2015 71672.080000 63063.0 775
2016 58877.210280 56494.0 856
2017 46525.788119 40000.0 1010
2018 33916.936725 30000.0 806
2019 16307.900515 10500.0 583
2020 20525.716216 7350.0 74
seats by year:
mean median count
year
1994 4.500000 4.5 2
1995 4.000000 4.0 1
1996 4.000000 4.0 2
1997 4.666667 4.0 9
1998 4.555556 5.0 9
1999 4.500000 4.5 14
2000 5.187500 5.0 16
2001 4.333333 4.0 6
2002 5.210526 5.0 19
2003 5.081081 5.0 37
2004 4.862745 5.0 51
2005 5.394737 5.0 76
2006 5.431373 5.0 102
2007 5.381503 5.0 173
2008 5.492537 5.0 201
2009 5.367965 5.0 231
2010 5.416000 5.0 375
2011 5.445614 5.0 570
2012 5.431562 5.0 621
2013 5.600299 5.0 668
2014 5.601613 5.0 620
2015 5.598710 5.0 775
2016 5.463785 5.0 856
2017 5.312871 5.0 1010
2018 5.277916 5.0 806
2019 5.204117 5.0 583
2020 5.243243 5.0 74
mileage_num by year:
mean median count
year
1994 14.450000 14.45 2
1995 16.100000 16.10 1
1996 16.100000 16.10 2
1997 16.066667 16.10 9
1998 15.933333 16.10 9
1999 15.964286 16.10 14
2000 16.368750 16.10 16
2001 16.700000 16.10 6
2002 16.327895 16.10 19
2003 15.778378 16.10 37
2004 15.952941 16.10 51
2005 15.943421 16.10 76
2006 16.174020 16.80 102
2007 16.614277 17.20 173
2008 16.542637 17.50 201
2009 17.011126 17.80 231
2010 17.504453 18.00 375
2011 18.026281 18.60 570
2012 19.102206 19.40 621
2013 19.296243 20.36 668
2014 19.722645 20.36 620
2015 20.136452 20.45 775
2016 20.293014 20.14 856
2017 20.864713 21.04 1010
2018 21.024715 21.13 806
2019 19.918491 19.50 583
2020 19.833108 20.89 74
engine_cc by year:
mean median count
year
1994 1395.500000 1395.5 2
1995 796.000000 796.0 1
1996 796.000000 796.0 2
1997 929.444444 796.0 9
1998 950.777778 796.0 9
1999 1153.428571 894.5 14
2000 1011.062500 796.0 16
2001 828.833333 796.0 6
2002 1187.000000 993.0 19
2003 1302.675676 998.0 37
2004 1252.490196 998.0 51
2005 1421.947368 1341.0 76
2006 1456.656863 1405.0 102
2007 1461.387283 1396.0 173
2008 1472.636816 1396.0 201
2009 1420.930736 1248.0 231
2010 1436.133333 1248.0 375
2011 1531.138596 1397.5 570
2012 1465.280193 1396.0 621
2013 1528.327844 1392.0 668
2014 1517.109677 1364.0 620
2015 1494.689032 1373.0 775
2016 1434.808411 1248.0 856
2017 1419.234653 1248.0 1010
2018 1404.681141 1248.0 806
2019 1492.324185 1199.0 583
2020 1360.054054 1197.0 74
max_power_bhp by year:
mean median count
year
1994 44.500000 44.500 2
1995 37.000000 37.000 1
1996 37.000000 37.000 2
1997 49.666667 37.000 9
1998 48.888889 37.000 9
1999 53.785714 48.500 14
2000 48.093333 46.300 15
2001 43.500000 37.000 6
2002 58.515789 60.000 19
2003 68.829730 66.000 37
2004 70.956863 62.000 51
2005 77.257895 69.000 76
2006 80.700980 74.000 102
2007 78.251387 67.000 173
2008 81.149950 70.000 201
2009 83.308139 73.900 231
2010 85.013573 75.000 375
2011 87.883719 75.000 570
2012 86.536812 78.000 621
2013 88.651108 80.900 668
2014 89.951871 82.000 620
2015 92.668852 88.500 775
2016 92.549562 88.500 856
2017 97.277218 86.795 1010
2018 95.964529 84.000 806
2019 114.106930 88.700 583
2020 93.640135 81.860 74
torque_nm by year:
mean median count
year
1994 82.500000 82.50 2
1995 59.000000 59.00 1
1996 59.000000 59.00 2
1997 71.666667 59.00 9
1998 66.344444 59.00 9
1999 59.035714 59.00 14
2000 64.456250 62.00 16
2001 64.166667 59.00 6
2002 115.715789 62.00 19
2003 115.908108 78.00 37
2004 74.284314 59.00 51
2005 81.931579 62.00 76
2006 74.576471 59.00 102
2007 83.514335 62.00 173
2008 95.944179 84.00 201
2009 120.586667 99.04 231
2010 127.309013 110.00 375
2011 156.955544 145.00 570
2012 166.619163 160.00 621
2013 182.045554 190.00 668
2014 186.246903 190.00 620
2015 186.209923 190.00 775
2016 174.057056 173.75 856
2017 192.094980 190.00 1010
2018 184.826911 170.00 806
2019 197.050137 142.00 583
2020 165.353649 113.75 74
torque_rpm by year:
mean median count
year
1994 2350.000000 2350.0 2
1995 2500.000000 2500.0 1
1996 2500.000000 2500.0 2
1997 3166.666667 2500.0 9
1998 3255.555556 2800.0 9
1999 2536.357143 2500.0 14
2000 2765.812500 2500.0 16
2001 3000.000000 2500.0 6
2002 3044.894737 2500.0 19
2003 2463.864865 2500.0 37
2004 2565.156863 2500.0 51
2005 2454.671053 3000.0 76
2006 1681.941176 1750.0 102
2007 1701.722543 1700.0 173
2008 1838.606965 2000.0 201
2009 2553.571429 2750.0 231
2010 2592.450667 2750.0 375
2011 2454.680702 2000.0 570
2012 2471.716586 2000.0 621
2013 2372.878743 2000.0 668
2014 2448.774194 2000.0 620
2015 2518.033548 1900.0 775
2016 2857.695093 2500.0 856
2017 2638.899010 2000.0 1010
2018 2767.153846 2000.0 806
2019 3092.819897 3500.0 583
2020 3020.270270 3500.0 74
- Average Selling Price by Year
- Overall Trend: Dramatic exponential increase from 2010 onwards
- 1983-2010: Relatively flat, ranging from ~50K to 250K
- 2010-2019: Steady upward climb from 250K to ~1M
- 2019-2020: Massive spike to 1.75M (likely luxury/premium cars entering the market)
- 2020-2021: Sharp drop back to ~900K Interpretation: Recent years show much higher car prices, possibly due to inflation, luxury segment growth, or currency effects
- Average KM Driven by Year
- Highly volatile pattern with significant fluctuations
- Peak at 1991: ~120K km (older cars had more usage)
- 1996 dip: Drops to ~36K km
- 2000-2010: Fluctuates between 60K-115K km
- 2010-2020: Clear downward trend from 100K to ~20K km Interpretation: Newer cars in the dataset have driven fewer kilometers (they're newer/less used)
- Average Seats by Year
- Range: 4-5.6 seats across all years
- 990s: More variation (4-5.2 seats)
- 2005-2015: Stabilized around 5.4-5.6 seats (peak SUV/family car popularity)
- 2015-2020: Slight decline to ~5.2 seats Interpretation: Shift toward larger vehicles (SUVs, MPVs) in 2000s-2010s, then slight return to smaller cars
- Average Mileage (Fuel Efficiency) by Year
- Strong upward trend from 1994 onwards
- 1994: ~14.5 km/L
- 2018: Peak at ~21 km/L (45% improvement!)
- 2018-2021: Slight decline to ~20 km/L Interpretation: Clear technological improvement in fuel efficiency due to better engines, aerodynamics, and hybrid technology
- Average Engine CC by Year
- Highly variable with no clear linear trend
- 1994: 1,400 cc
- 1995-1997: Sharp drop to ~800 cc (small car boom)
- 2005-2013: Rises to 1,400-1,550 cc (larger engines)
- 2020-2021: Drops to ~1,360 cc Interpretation: Market shifts between small economy cars and larger performance vehicles
- Average Max Power (BHP) by Year
- Strong upward trend indicating more powerful engines
- 1994: ~45 BHP
- 2019: Peak at ~114 BHP (2.5x increase!)
- 2000s: Acceleration in power increase begins Interpretation: Consumer demand for performance; turbocharged engines became common
- Average Torque (Nm) by Year
- Dramatic upward trend especially after 2008
- 1994: ~82 Nm
- 2019: ~197 Nm (2.4x increase!)
- 2008-2019: Steepest increase period Interpretation: Better engine technology (turbocharging, diesel engines) providing more low-end power
- Average Torque RPM by Year
- Most volatile graph with major fluctuations
- 1998-1999: Peak at ~3,250 RPM
- 2006-2007: Dramatic drop to ~1,700 RPM (diesel engine adoption?)
- 2019: Returns to ~3,150 RPM Interpretation: Lower RPM = torque available earlier (diesel characteristic); recent rise suggests petrol/hybrid shift
In [35]:
target = df['selling_price']
In [36]:
# ensure target_col is a column name (string)
if isinstance(target, pd.Series):
target_col = target.name
else:
target_col = target
if target_col in df.columns:
# numeric columns excluding target
num_cols = [c for c in df.select_dtypes(include=[np.number]).columns.tolist() if c != target_col]
# categorical columns
cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
# Target vs numeric features (scatter)
for col in num_cols:
tmp = df[[col, target_col]].dropna()
if tmp.empty:
continue
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(tmp[col], tmp[target_col], alpha=0.4)
ax.set_title(f"{target_col} vs {col}")
ax.set_xlabel(col)
ax.set_ylabel(target_col)
plt.tight_layout()
plt.show()
# Target vs categorical features (box via grouped data)
for col in cat_cols:
# limit to top categories to avoid unreadable plots
top = df[col].value_counts().head(10).index
tmp = df[df[col].isin(top)][[col, target_col]].dropna()
if tmp.empty:
continue
groups = [tmp.loc[tmp[col] == k, target_col].values for k in top]
fig, ax = plt.subplots(figsize=(10, 4))
ax.boxplot(groups, labels=[str(k) for k in top], showfliers=False)
ax.set_title(f"{target_col} by {col} (top 10 categories)")
ax.set_xlabel(col)
ax.set_ylabel(target_col)
ax.tick_params(axis="x", rotation=45)
plt.tight_layout()
plt.show()
C:\Users\Laiba\AppData\Local\Temp\ipykernel_26740\4239846875.py:36: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11. ax.boxplot(groups, labels=[str(k) for k in top], showfliers=False)
C:\Users\Laiba\AppData\Local\Temp\ipykernel_26740\4239846875.py:36: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11. ax.boxplot(groups, labels=[str(k) for k in top], showfliers=False)
C:\Users\Laiba\AppData\Local\Temp\ipykernel_26740\4239846875.py:36: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11. ax.boxplot(groups, labels=[str(k) for k in top], showfliers=False)
C:\Users\Laiba\AppData\Local\Temp\ipykernel_26740\4239846875.py:36: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11. ax.boxplot(groups, labels=[str(k) for k in top], showfliers=False)
C:\Users\Laiba\AppData\Local\Temp\ipykernel_26740\4239846875.py:36: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11. ax.boxplot(groups, labels=[str(k) for k in top], showfliers=False)
FEATURE ENGINEERING¶
In [37]:
#The dataset max year is 2020, so we will use that as the current year to calculate the age of the car. We will then drop the year column as it is no longer needed.
current_year = 2020
df["car_age"] = current_year - df["year"]
df.drop(columns=["year"], inplace=True)
In [38]:
# Normalizing the km_driven by the age of the car to get a better sense of how much the car has been driven per year.
# This can help us understand if a car has been driven more than average for its age, which could affect its price.
df["km_per_year"] = df["km_driven"] / (df["car_age"] + 1)
In [39]:
# Engine power to engine size ratio, which can be an indicator of the car's performance.
# A higher ratio may indicate a more powerful engine relative to its size, which could affect the car's price.
df["power_to_engine"] = df["max_power_bhp"] / (df["engine_cc"] + 1)
In [40]:
# Mapping the owner column to numeric values, where "First Owner" is 1, "Second Owner" is 2, and so on.
# This allows us to use the owner information in a numerical format for modeling, while still capturing the ordinal nature of ownership
owner_map = {
"First Owner": 1,
"Second Owner": 2,
"Third Owner": 3,
"Fourth & Above Owner": 4,
"Test Drive Car": 0
}
df["owner_num"] = df["owner"].map(owner_map)
df.drop(columns=["owner"], inplace=True)
In [41]:
# Extracting the brand from the name column,
# which can be a useful feature for modeling as certain brands may have higher resale values than others.
df["brand"] = df["name"].apply(lambda x: x.split()[0])
df.drop(columns=["name"], inplace=True)
In [42]:
df = pd.get_dummies(df, columns=["brand"], drop_first=True)
In [43]:
# Encoding the transmission column as a binary variable, where "Automatic" is 1 and "Manual" is 0.
df["is_automatic"] = (df["transmission"] == "Automatic").astype(int)
df.drop(columns=["transmission"], inplace=True)
In [44]:
# Encoding the fuel column using one-hot encoding, which creates binary columns for each fuel type. We drop the first category to avoid multicollinearity.
df = pd.get_dummies(df, columns=["fuel"], drop_first=True)
In [45]:
# Encoding the seller_type column using one-hot encoding, which creates binary columns for each seller type. We drop the first category to avoid multicollinearity.
df = pd.get_dummies(df, columns=["seller_type"], drop_first=True)
In [46]:
# Log-transforming the selling_price to reduce skewness and make the distribution more normal, which can help improve the performance of linear models.
df["selling_price"] = np.log1p(df["selling_price"])
In [47]:
df.head()
Out[47]:
| selling_price | km_driven | seats | mileage_num | engine_cc | max_power_bhp | torque_nm | torque_rpm | car_age | km_per_year | ... | brand_Tata | brand_Toyota | brand_Volkswagen | brand_Volvo | is_automatic | fuel_Diesel | fuel_LPG | fuel_Petrol | seller_type_Individual | seller_type_Trustmark Dealer | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 13.017005 | 145500 | 5.0 | 23.40 | 1248.0 | 74.00 | 190.0 | 2000.0 | 6 | 20785.714286 | ... | False | False | False | False | 0 | True | False | False | True | False |
| 1 | 12.821261 | 120000 | 5.0 | 21.14 | 1498.0 | 103.52 | 250.0 | 1500.0 | 6 | 17142.857143 | ... | False | False | False | False | 0 | True | False | False | True | False |
| 2 | 11.970357 | 140000 | 5.0 | 17.70 | 1497.0 | 78.00 | 12.7 | 2.0 | 14 | 9333.333333 | ... | False | False | False | False | 0 | False | False | True | True | False |
| 3 | 12.323860 | 127000 | 5.0 | 23.00 | 1396.0 | 90.00 | 22.4 | 2750.0 | 10 | 11545.454545 | ... | False | False | False | False | 0 | True | False | False | True | False |
| 4 | 11.775297 | 120000 | 5.0 | 16.10 | 1298.0 | 88.20 | 11.5 | 4.0 | 13 | 8571.428571 | ... | False | False | False | False | 0 | False | False | True | True | False |
5 rows × 48 columns
In [48]:
df = df.drop(columns=["torque_nm", "torque_rpm"])
In [49]:
df.columns
Out[49]:
Index(['selling_price', 'km_driven', 'seats', 'mileage_num', 'engine_cc',
'max_power_bhp', 'car_age', 'km_per_year', 'power_to_engine',
'owner_num', 'brand_Ashok', 'brand_Audi', 'brand_BMW',
'brand_Chevrolet', 'brand_Daewoo', 'brand_Datsun', 'brand_Fiat',
'brand_Force', 'brand_Ford', 'brand_Honda', 'brand_Hyundai',
'brand_Isuzu', 'brand_Jaguar', 'brand_Jeep', 'brand_Kia', 'brand_Land',
'brand_Lexus', 'brand_MG', 'brand_Mahindra', 'brand_Maruti',
'brand_Mercedes-Benz', 'brand_Mitsubishi', 'brand_Nissan', 'brand_Opel',
'brand_Renault', 'brand_Skoda', 'brand_Tata', 'brand_Toyota',
'brand_Volkswagen', 'brand_Volvo', 'is_automatic', 'fuel_Diesel',
'fuel_LPG', 'fuel_Petrol', 'seller_type_Individual',
'seller_type_Trustmark Dealer'],
dtype='object')
In [50]:
df = df.drop(columns=[
"km_driven",
"engine_cc"
])
In [51]:
df.columns
Out[51]:
Index(['selling_price', 'seats', 'mileage_num', 'max_power_bhp', 'car_age',
'km_per_year', 'power_to_engine', 'owner_num', 'brand_Ashok',
'brand_Audi', 'brand_BMW', 'brand_Chevrolet', 'brand_Daewoo',
'brand_Datsun', 'brand_Fiat', 'brand_Force', 'brand_Ford',
'brand_Honda', 'brand_Hyundai', 'brand_Isuzu', 'brand_Jaguar',
'brand_Jeep', 'brand_Kia', 'brand_Land', 'brand_Lexus', 'brand_MG',
'brand_Mahindra', 'brand_Maruti', 'brand_Mercedes-Benz',
'brand_Mitsubishi', 'brand_Nissan', 'brand_Opel', 'brand_Renault',
'brand_Skoda', 'brand_Tata', 'brand_Toyota', 'brand_Volkswagen',
'brand_Volvo', 'is_automatic', 'fuel_Diesel', 'fuel_LPG', 'fuel_Petrol',
'seller_type_Individual', 'seller_type_Trustmark Dealer'],
dtype='object')
In [60]:
df = df.dropna()
df.isna().sum()
Out[60]:
selling_price 0 seats 0 mileage_num 0 max_power_bhp 0 car_age 0 km_per_year 0 power_to_engine 0 owner_num 0 brand_Ashok 0 brand_Audi 0 brand_BMW 0 brand_Chevrolet 0 brand_Daewoo 0 brand_Datsun 0 brand_Fiat 0 brand_Force 0 brand_Ford 0 brand_Honda 0 brand_Hyundai 0 brand_Isuzu 0 brand_Jaguar 0 brand_Jeep 0 brand_Kia 0 brand_Land 0 brand_Lexus 0 brand_MG 0 brand_Mahindra 0 brand_Maruti 0 brand_Mercedes-Benz 0 brand_Mitsubishi 0 brand_Nissan 0 brand_Opel 0 brand_Renault 0 brand_Skoda 0 brand_Tata 0 brand_Toyota 0 brand_Volkswagen 0 brand_Volvo 0 is_automatic 0 fuel_Diesel 0 fuel_LPG 0 fuel_Petrol 0 seller_type_Individual 0 seller_type_Trustmark Dealer 0 dtype: int64
In [61]:
X = df.drop(columns=["selling_price"])
y = df["selling_price"]
In [62]:
print(X)
seats mileage_num max_power_bhp car_age km_per_year \
0 5.0 23.40 74.00 6 20785.714286
1 5.0 21.14 103.52 6 17142.857143
2 5.0 17.70 78.00 14 9333.333333
3 5.0 23.00 90.00 10 11545.454545
4 5.0 16.10 88.20 13 8571.428571
... ... ... ... ... ...
8123 5.0 18.50 82.85 7 13750.000000
8124 5.0 16.80 110.00 13 8500.000000
8125 5.0 19.30 73.90 11 10000.000000
8126 5.0 23.57 70.00 7 3125.000000
8127 5.0 23.57 70.00 7 3125.000000
power_to_engine owner_num brand_Ashok brand_Audi brand_BMW ... \
0 0.059247 1 False False False ...
1 0.069059 2 False False False ...
2 0.052069 3 False False False ...
3 0.064424 1 False False False ...
4 0.067898 1 False False False ...
... ... ... ... ... ... ...
8123 0.069157 1 False False False ...
8124 0.073628 4 False False False ...
8125 0.059167 1 False False False ...
8126 0.050107 1 False False False ...
8127 0.050107 1 False False False ...
brand_Tata brand_Toyota brand_Volkswagen brand_Volvo is_automatic \
0 False False False False 0
1 False False False False 0
2 False False False False 0
3 False False False False 0
4 False False False False 0
... ... ... ... ... ...
8123 False False False False 0
8124 False False False False 0
8125 False False False False 0
8126 True False False False 0
8127 True False False False 0
fuel_Diesel fuel_LPG fuel_Petrol seller_type_Individual \
0 True False False True
1 True False False True
2 False False True True
3 True False False True
4 False False True True
... ... ... ... ...
8123 False False True True
8124 True False False True
8125 True False False True
8126 True False False True
8127 True False False True
seller_type_Trustmark Dealer
0 False
1 False
2 False
3 False
4 False
... ...
8123 False
8124 False
8125 False
8126 False
8127 False
[7906 rows x 43 columns]
In [63]:
print(y)
0 13.017005
1 12.821261
2 11.970357
3 12.323860
4 11.775297
...
8123 12.676079
8124 11.813037
8125 12.853179
8126 12.577640
8127 12.577640
Name: selling_price, Length: 7906, dtype: float64
In [64]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [65]:
Lr = LinearRegression()
In [66]:
Lr.fit(X_train, y_train)
Out[66]:
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
In [67]:
train_data = Lr.predict(X_train)
In [68]:
# Evaluate the model on the training data
mse_train = mean_squared_error(y_train, train_data)
r2_train = r2_score(y_train, train_data)
print(f"Training MSE: {mse_train:.4f}")
print(f"Training R^2: {r2_train:.4f}")
Training MSE: 0.0598 Training R^2: 0.9127
In [72]:
plt.scatter(y_train, train_data, alpha=0.5)
plt.xlabel("Actual Selling Price (log-transformed)")
plt.ylabel("Predicted Selling Price (log-transformed)")
plt.title("Actual vs Predicted Selling Price (Training Data)")
plt.show()
In [70]:
test_data = Lr.predict(X_test)
In [71]:
mse_test = mean_squared_error(y_test, test_data)
r2_test = r2_score(y_test, test_data)
print(f"Training MSE: {mse_test:.4f}")
print(f"Training R^2: {r2_test:.4f}")
Training MSE: 0.0647 Training R^2: 0.9052
Lasso Regresiion¶
In [73]:
# Lasso regression with alpha=0.1
Lasso_model = Lasso(alpha=0.1)
Lasso_model.fit(X_train, y_train)
Out[73]:
Lasso(alpha=0.1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Lasso(alpha=0.1)
In [75]:
train_data = Lasso_model.predict(X_train)
In [76]:
# Evaluate the model on the training data
mse_train = mean_squared_error(y_train, train_data)
r2_train = r2_score(y_train, train_data)
print(f"Training MSE: {mse_train:.4f}")
print(f"Training R^2: {r2_train:.4f}")
Training MSE: 0.1112 Training R^2: 0.8377
In [77]:
test_data = Lasso_model.predict(X_test)
mse_test = mean_squared_error(y_test, test_data)
r2_test = r2_score(y_test, test_data)
print(f"Training MSE: {mse_test:.4f}")
print(f"Training R^2: {r2_test:.4f}")
Training MSE: 0.1098 Training R^2: 0.8392
Ridge Regression¶
In [78]:
Ridge_model = Ridge(alpha=1.0)
Ridge_model.fit(X_train, y_train)
train_data = Ridge_model.predict(X_train)
In [79]:
mse_train = mean_squared_error(y_train, train_data)
r2_train = r2_score(y_train, train_data)
print(f"Training MSE: {mse_train:.4f}")
print(f"Training R^2: {r2_train:.4f}")
Training MSE: 0.0600 Training R^2: 0.9125
In [80]:
test_data = Ridge_model.predict(X_test)
In [81]:
mse_test = mean_squared_error(y_test, test_data)
r2_test = r2_score(y_test, test_data)
print(f"Training MSE: {mse_test:.4f}")
print(f"Training R^2: {r2_test:.4f}")
Training MSE: 0.0644 Training R^2: 0.9056
In [ ]:
In [ ]: